In [1]:
import time
import datetime

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import math

import plotly.offline as py
py.init_notebook_mode(connected=False)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
from plotly import tools

from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

Data Collection

The data can be found at https://www.kaggle.com/airbnb/seattle/data

In [2]:
# Importing Data: the three Seattle Airbnb CSVs from Kaggle
# (https://www.kaggle.com/airbnb/seattle/data), read relative to the
# notebook's working directory.
calendar = pd.read_csv('seattle/calendar.csv')
reviews = pd.read_csv('seattle/reviews.csv')
listings = pd.read_csv('seattle/listings.csv')
In [3]:
calendar.columns
Out[3]:
Index(['listing_id', 'date', 'available', 'price'], dtype='object')
In [4]:
reviews.columns
Out[4]:
Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object')
In [5]:
# Sanity check: listings with a missing id (an empty frame means none are missing)
listings.loc[listings.id.isna()]
Out[5]:
id listing_url scrape_id last_scraped name summary space description experiences_offered neighborhood_overview ... review_scores_value requires_license license jurisdiction_names instant_bookable cancellation_policy require_guest_profile_picture require_guest_phone_verification calculated_host_listings_count reviews_per_month

0 rows × 92 columns

In [6]:
# Sanity check: calendar rows with a missing listing_id (expect an empty frame)
calendar.loc[calendar.listing_id.isna()]
Out[6]:
listing_id date available price
In [7]:
# Sanity check: calendar rows with a missing date (expect an empty frame)
calendar.loc[calendar.date.isna()]
Out[7]:
listing_id date available price
In [8]:
listings.columns
Out[8]:
Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'thumbnail_url', 'medium_url', 'picture_url',
       'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since',
       'host_location', 'host_about', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'street', 'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet',
       'price', 'weekly_price', 'monthly_price', 'security_deposit',
       'cleaning_fee', 'guests_included', 'extra_people', 'minimum_nights',
       'maximum_nights', 'calendar_updated', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'calendar_last_scraped', 'number_of_reviews',
       'first_review', 'last_review', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'requires_license',
       'license', 'jurisdiction_names', 'instant_bookable',
       'cancellation_policy', 'require_guest_profile_picture',
       'require_guest_phone_verification', 'calculated_host_listings_count',
       'reviews_per_month'],
      dtype='object')

Data Cleaning

In [9]:
# Convert price from a string like "$1,250.00" to a float by stripping
# the dollar sign and thousands separators. A raw string is used for the
# regex: '[\$,]' in a plain string is an invalid escape sequence and
# raises a warning on newer Python versions.
calendar['price'] = calendar['price'].replace(r'[\$,]', '', regex=True).astype(float)
# Convert date from string to a pandas datetime so it can be grouped by time
calendar['date'] = pd.to_datetime(calendar['date'])
In [10]:
# Change date from string to datetime object so reviews can be grouped/plotted by time
reviews['date']=pd.to_datetime(reviews['date'])
In [11]:
# Convert the three price columns from strings like "$1,250.00" to floats
# by stripping the dollar sign and thousands separators. A raw string is
# used for the regex ('[\$,]' in a plain string is an invalid escape
# sequence), and a loop avoids repeating the same line three times.
for price_col in ['price', 'weekly_price', 'monthly_price']:
    listings[price_col] = listings[price_col].replace(r'[\$,]', '', regex=True).astype(float)

Questions:

  • Does the amount of available listings affect the price?
  • What is the most common type of Airbnb living situation?
  • What is the most listed price range?

Does the amount of available listings affect the price?

Hypothesis: My hypothesis is that when there is a lower amount of available listings, the price would go up.

To test this I will do a simple linear regression to see if the F-statistic is below 0.05. This will indicate that the available listings do indeed have a significant impact on the list price.

Another part needed to reject the null hypothesis is the direction of the relationship: under my hypothesis, fewer available listings should correspond to a higher average list price, so the coefficient of available listings should be negative.

In [12]:
# Aggregate the calendar to one row per day: the mean list price and the
# number of unique listings available that day (rows where available == 't').
Avail_calendarcounts_Daily=calendar.loc[calendar.available=='t'].groupby('date') \
                .agg({'price': 'mean', 'listing_id': pd.Series.nunique}).reset_index()

# Simple linear regression: does the daily count of available listings (X)
# explain the average daily list price (y)? Selecting with a list of
# columns keeps X 2-D, which is what statsmodels expects — no need to
# build a list of single-element lists in a Python loop.
X = Avail_calendarcounts_Daily[['listing_id']]
y = Avail_calendarcounts_Daily['price']

# Add an intercept column and fit ordinary least squares; the summary
# reports the F-statistic (overall significance) and the coefficient on
# the available-listings count used to test the hypothesis.
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     1.371
Date:                Fri, 21 Jun 2019   Prob (F-statistic):              0.242
Time:                        00:38:24   Log-Likelihood:                -1352.0
No. Observations:                 365   AIC:                             2708.
Df Residuals:                     363   BIC:                             2716.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        129.4353      7.250     17.852      0.000     115.177     143.693
x1             0.0033      0.003      1.171      0.242      -0.002       0.009
==============================================================================
Omnibus:                       33.348   Durbin-Watson:                   0.141
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               10.791
Skew:                           0.009   Prob(JB):                      0.00454
Kurtosis:                       2.158   Cond. No.                     3.61e+04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.61e+04. This might indicate that there are
strong multicollinearity or other numerical problems.

After doing the simple linear regression we can see that the R-squared statistic is only 0.004, which means the available apartments account for less than a percent of the variance in price. If we look at the p-value of the F-statistic, we can also see that it is well above 0.05, which means there is no significant relationship between the available apartments and the average list price. I therefore fail to reject the null hypothesis: in this dataset there is no clear relationship between list price and the number of available apartments.

In [13]:
"""
Calculations for graph:
In order for the graph to look a little better I decided to group the 
data on a weekly basis rather than on a daily basis. 

I also decided to count unique listings available to include in the scatter plot.
    
"""
Avail_calendarcounts=calendar.loc[calendar.available=='t'].groupby(pd.Grouper(key='date', freq='W-MON')) \
                .agg({'price': 'mean', 'listing_id': pd.Series.nunique}).reset_index()

trace1 = go.Scatter(
    x=list(Avail_calendarcounts.date),
    y=list(Avail_calendarcounts.price),
    name='Price'
)
trace2 = go.Scatter(
    x=list(Avail_calendarcounts.date),
    y=list(Avail_calendarcounts.listing_id),
    name='Available apartments',
    yaxis='y2'
)
data = [trace1, trace2]
layout = go.Layout(
    title='Price vs available apartments',
    xaxis=dict(
        title='Date'
    ),
    yaxis=dict(
        title='Average Listing Price($)'
    ),
    yaxis2=dict(
        title='Apartments Available',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    )
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='multiple-axes-double')

To add onto the statistical test I did to look at the relationship, I also wanted some visual confirmation. Looking at the graph above one can see that the price clearly isn't much influenced by the available apartments. In some cases, price and available apartments would increase together but at other times they move in completely the opposite direction.

What is the most common type of Airbnb living situation?

I find it very interesting how the Airbnb community chooses to list their apartments. For me, the living situation can be very important and is probably the first thing most people consider when renting. The living situation is mostly what type of building you live in (property type) as well as the type of sharing privileges you have (room type).

In [14]:
"""
For this graph I wanted to have a horisontal box plot graph. 

I also wanted to colour the bars based on the average price of the property type. 
"""
# Calculations for graph
property_type_listings=listings.groupby('property_type').agg({'id':'count','price':'mean'}) \
                        .sort_values(by='id',ascending=True).reset_index()

# Calculations for color
labels=['red','orange','blue','green']

property_type_listings['price_color']=pd.cut(property_type_listings.price,4,labels=labels)

# Draw figure
data = [go.Bar(
    x=property_type_listings.id,
    y=property_type_listings.property_type,
    orientation = 'h',
    marker=dict(color=property_type_listings['price_color'])
            
)]

layout = go.Layout(
    margin=dict(
        l=120,
        r=10,
        t=140,
        b=80
    ),
    showlegend=False,
    title='Most common property types',
    xaxis= dict(title= 'Amount of listings'),
    yaxis= dict(title= 'Property Type')
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='horizontal-bar')
In [15]:
# What share of all listings do the two dominant property types
# (House and Apartment) account for?
house_and_apartment_total = property_type_listings.loc[
    property_type_listings.property_type.isin(['House', 'Apartment'])
].id.sum()
total_listings = property_type_listings.id.sum()
print(round(100 * house_and_apartment_total / total_listings, 2), '%')
90.15 %

On this graph, one can see that houses and apartments dominate the listings: as computed below, together they account for just over 90% of all property types. To get an even greater understanding, the next step is to look at the room-type split.

In [16]:
# Calculations to draw room type pie chart: listing count per room type
room_type_listings=listings.groupby('room_type').agg({'id':'count'}) \
                    .sort_values(by='id',ascending=True).reset_index()

# Donut chart ("hole": .5) of the room-type split; rotation only affects
# where the first slice starts, for nicer label placement
roomtype = {
    "data": [
        {
            "values": room_type_listings.id,
            "labels": room_type_listings.room_type,
            "hole": .5,
            "type": "pie",
            "rotation":120
        },
    ],
    "layout": {
        "title":"Room Type Comparison",
    }
}
py.iplot(roomtype)

On this graph one can see that two-thirds of the listings are for an entire home/apartment. Another 30% of the listings are private rooms, and shared rooms only occupy 3% of the listings. To get an even greater understanding I think the best would be to look at room type/property type combinations.

In [25]:
"""
For this graph I only wanted to show the top most used living situations
along with the other group to get a better understanding of the data
"""
# Calculations to get values for pie chart
propandroom_type_combined=listings.groupby(['property_type','room_type']).agg({'id':'count'}) \
                        .sort_values(by='id',ascending=False).reset_index()

# Create a more readable name
propandroom_type_combined['name']=propandroom_type_combined.property_type+' | '+propandroom_type_combined.room_type

# Plotting graph
inclidedinlist=4
values=list(propandroom_type_combined[:inclidedinlist].id)
values.append(propandroom_type_combined[inclidedinlist:].id.sum())
labels=list(propandroom_type_combined[:inclidedinlist].name)
labels.append('Other')

pandr_type = {
    "data": [
        {
            "values": values,
            "labels": labels,
            "hole": .5,
            "type": "pie",
            "rotation":225
        },
    ],
    "layout": {
        "title":"Property and Room Type Combined Comparison (Top 4)",
    }
}
py.iplot(pandr_type)

As can be seen on the graph above, the most common listing situation is apartments that are entirely rented out. This accounts for 37% of all the listings. We can see that house listings are almost half and half when it comes to room types, with the entire home being just a little bit more. It is pretty interesting to see that houses are more commonly shared by means of private rooms than apartments. This statistic does sound like it could make sense, but I was pretty surprised to see that the house split was so similar. These statistics definitely give one a better understanding of the listings in Seattle.

What is the most listed price range?

The best way to visually represent this will be to display a histogram, so to accomplish this I will first have to create bins.

In [18]:
"""
Since only available listings has prices, I removed all the rows where listings where not available.
"""
availablelistings=calendar.loc[(calendar.available=='t')]
In [19]:
# Plot 75 bin histogram.
# The same histogram is drawn several times with different bin widths, so
# the plotting is factored into one parameterized helper instead of
# copy-pasting the cell for every width.

def plot_price_histogram(prices, bin_size, x_range=(1, 600)):
    """Draw a plotly histogram of listing prices.

    prices:   series/array of listing prices to bin.
    bin_size: width in dollars of each histogram bin.
    x_range:  x-axis limits shown on the plot.
    """
    data = [go.Histogram(
        x=prices,
        xbins=dict(size=bin_size),
        opacity=0.85
    )]
    layout = go.Layout(
        title='Bin Size {}'.format(bin_size),
        xaxis=dict(
            title='Price Range',
            range=list(x_range)
        ),
        yaxis=dict(
            title='Listings'
        ),
        bargap=0.1,
        bargroupgap=0.1
    )
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)

plot_price_histogram(availablelistings.price, 75)
In [20]:
# Plot 50 bin histogram
# Same chart as the previous cell but with $50-wide bins, to compare
# how the choice of bin width affects the picture.

data=[go.Histogram(
    x=availablelistings.price,
    xbins=dict(
        size=50
    ),   
    opacity=0.85     
  )]

layout = go.Layout(
    title='Bin Size 50',
    xaxis=dict(
        title='Price Range',
        range=[1, 600]
        
    ),
    yaxis=dict(
        title='Listings'
    ),
    bargap=0.1,
    bargroupgap=0.1
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig)
In [21]:
# Plot 25 bin histogram
# Same chart again with $25-wide bins — the width ultimately chosen for
# the binned analysis below.

data=[go.Histogram(
    x=availablelistings.price,
    xbins=dict(
        size=25
    ),   
    opacity=0.85     
  )]

layout = go.Layout(
    title='Bin Size 25',
    xaxis=dict(
        title='Price Range',
        range=[1, 600]
        
    ),
    yaxis=dict(
        title='Listings'
    ),
    bargap=0.1,
    bargroupgap=0.1
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig)
In [22]:
"""
To give a better understanding of how the data is split percentage wise I wanted to use a pie chart.
Since most of the data lies between 25-250 I decided to use the 25 bins for that ranges and then group the rest together.
I had a look at how much data was below 25 and there was only about 0.3% so I chose to discard it.
"""
# Creating bins from 25-250
bintuples=[]
j=25
for i in range(1,10):
    bintuples.append((i*j,(i+1)*j))

# Adding huge bin for eveything above 250
bintuples.append((250,1650))

bins=pd.IntervalIndex.from_tuples(bintuples)

# Assigning rows to bins based on price
availablelistings['price_bin']=pd.cut(availablelistings.price, bins)
availablelistings['price_bin_str']=availablelistings['price_bin'].apply(lambda bin: str(bin))

#Grouping data based on bins
binnedcounts=availablelistings.groupby(availablelistings.price_bin_str).agg({'listing_id':'count'}).reset_index()
In [23]:
binnedcounts
Out[23]:
price_bin_str listing_id
0 (25, 50] 77241
1 (50, 75] 166270
2 (75, 100] 203513
3 (100, 125] 124325
4 (125, 150] 109140
5 (150, 175] 56536
6 (175, 200] 51283
7 (200, 225] 24997
8 (225, 250] 33934
9 (250, 1650] 85423

In order to find the correct bin size I looked at bin sizes of 75, 50, and 25. In the three graphs above one can see the available listings in each price range. I believe the $25 price range bin is the best as it looks like it splits the data very well. To do some further testing I looked at the top 7 bars as this contains the majority of the data.

In [24]:
# Donut chart of the binned price ranges computed above; rotation only
# shifts where the first slice starts, for nicer label placement
pricerange = {
    "data": [
        {
            "values": binnedcounts.listing_id,
            "labels": binnedcounts.price_bin_str,
            "hole": .5,
            "type": "pie",
            "rotation":281
            
        },
    ],
    "layout": {
        "title":"Price range comparison",
    }
}
py.iplot(pricerange)

The pie chart above shows us exactly how the prices of listings are split up. Here are some stats that can be found on the chart:

  • We can see that the price range between \$75 and \$100 is the most posted listing price range. This range is then closely followed by the \$50 to \$75 bucket, which also confirms what we saw in the histogram with the \$50 bin width.
  • Just a little over 50% of the listings are between \$50 and \$125. This also reflects in the histogram where the bin width was \$75.
  • Another interesting stat that can be seen is that just under 10% of all the listings are above \$250, which is pretty interesting.
In [ ]: